import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.simplefilter("ignore")
# Load the vehicle silhouette dataset and keep an untouched copy for later use.
source_file = "vehicle-1.csv"
mydata = pd.read_csv(source_file)
mydata_copy = mydata.copy()
# Quick sanity checks: preview rows and count missing values per column.
mydata.head()
mydata.isnull().sum()
# Numeric feature columns that contain missing values.
cols = ['compactness', 'circularity', 'distance_circularity', 'radius_ratio',
        'pr.axis_aspect_ratio', 'max.length_aspect_ratio', 'scatter_ratio',
        'elongatedness', 'pr.axis_rectangularity', 'max.length_rectangularity',
        'scaled_variance', 'scaled_variance.1', 'scaled_radius_of_gyration',
        'scaled_radius_of_gyration.1', 'skewness_about', 'skewness_about.1',
        'skewness_about.2', 'hollows_ratio']
# Impute missing values with each column's median (robust to the outliers
# present in several of these features).  A single vectorized fillna replaces
# the original per-column loop.
mydata[cols] = mydata[cols].fillna(mydata[cols].median())
# Verify no missing values remain.
mydata.isnull().sum()
There are no outliers for Compactness
There are no outliers for Circularity
There are no outliers for distance_circularity
There are no outliers for scatter_ratio
There are no outliers for elongatedness
There are no outliers for pr.axis_rectangularity
There are no outliers for max.length_rectangularity
There are no outliers for scaled_radius_of_gyration
There are no outliers for skewness_about.2
There are no outliers for hollows_ratio
There are outliers for radius_ratio
There are outliers for pr.axis_aspect_ratio
There are outliers for max.length_aspect_ratio
There are outliers for scaled_variance
There are outliers for scaled_variance.1
There are outliers for scaled_radius_of_gyration.1
There are outliers for skewness_about
There are outliers for skewness_about.1
# Draw one boxplot per feature to visually inspect for outliers.
for col_name in cols:
    plt.figure(figsize=(6, 6))
    mydata.boxplot([col_name])
from scipy import stats

# Columns that showed outliers in the boxplots above.  For each, add an
# absolute z-score helper column (prefixed with 'z') so rows with |z| > 3
# can be filtered out below.  A loop replaces the original eight
# copy-pasted assignments.
outlier_cols = ['radius_ratio', 'pr.axis_aspect_ratio',
                'max.length_aspect_ratio', 'scaled_variance',
                'scaled_variance.1', 'scaled_radius_of_gyration.1',
                'skewness_about', 'skewness_about.1']
for col_name in outlier_cols:
    mydata['z' + col_name] = np.abs(stats.zscore(mydata[col_name]))
mydata.head()
# Remove records whose absolute z-score exceeds 3 for any of the treated
# columns, reporting how many rows each successive filter removes.
# A loop replaces eight copy-pasted filter/print stanzas; the printed
# messages now consistently use the z-column name (the original's first
# message alone omitted the 'z' prefix).
zscore_cols = ['zradius_ratio', 'zpr.axis_aspect_ratio',
               'zmax.length_aspect_ratio', 'zscaled_variance',
               'zscaled_variance.1', 'zscaled_radius_of_gyration.1',
               'zskewness_about', 'zskewness_about.1']
df_clean = mydata
for zcol in zscore_cols:
    before = df_clean.shape[0]
    df_clean = df_clean[df_clean[zcol] <= 3]
    print("%i records have been removed after treating %s" % (before - df_clean.shape[0], zcol))
    record = df_clean.shape[0]
    print("Total Records - %i" % (record))
# Snapshot the cleaned frame (z-score helper columns still attached) before
# they are dropped; the two copies are used independently below.
a = df_clean.copy()
mydata_copy = df_clean.copy()
There is a significant reduction in the outliers for all the columns
# Drop the z-score helper columns plus the target so only the numeric
# features remain, then re-check the boxplots after outlier removal.
helper_cols = ['zradius_ratio', 'zpr.axis_aspect_ratio',
               'zmax.length_aspect_ratio', 'zscaled_variance',
               'zscaled_variance.1', 'zscaled_radius_of_gyration.1',
               'zskewness_about', 'zskewness_about.1', 'class']
df_clean.drop(helper_cols, axis=1, inplace=True)
for col_name in df_clean.columns:
    plt.figure(figsize=(6, 6))
    df_clean.boxplot(col_name)
# Remove the z-score helper columns from the snapshot ('class' is kept here)
# and summarise every column, including the categorical target.
zscore_helpers = ['zradius_ratio', 'zpr.axis_aspect_ratio',
                  'zmax.length_aspect_ratio', 'zscaled_variance',
                  'zscaled_variance.1', 'zscaled_radius_of_gyration.1',
                  'zskewness_about', 'zskewness_about.1']
a.drop(zscore_helpers, axis=1, inplace=True)
a.describe(include='all').transpose()
#There are 3 unique values for the 'class' variable. Car has the highest count
We find that most of the independent variables are having a very high positive correlation
a.corr()
From the pairplot, we get almost the same inferences from the correlation matrix.
sns.pairplot(a)
We will drop the variables that have a correlation value > 0.8, because one variable largely explains the other and there is no need to keep both in the same dataset. Domain expertise is also required here to check whether we are losing any relevant information by dropping these variables. Based on the correlation numbers, we decide to drop the following:
elongatedness
pr.axis_rectangularity
max.length_rectangularity
scaled_radius_of_gyration
skewness_about.2
scatter_ratio
scaled_variance
scaled_variance.1
Note: Of course, there would be higher accuracy if we included all the variables, but that also includes the noise they generate. We need to strike a balance so that we do not lose information while keeping a minimum number of variables in the dataset. This is a trade-off that addresses the curse of dimensionality: too many variables with too few rows of data leads to exactly this problem.
# Features with pairwise correlation > 0.8, chosen for removal above.
rem = ['max.length_rectangularity', 'scaled_radius_of_gyration',
       'skewness_about.2', 'scatter_ratio', 'elongatedness',
       'pr.axis_rectangularity', 'scaled_variance', 'scaled_variance.1']
df_clean.drop(rem, axis=1, inplace=True)
Data needs to be scaled for any distance based algorithm such as clustering
from sklearn.preprocessing import StandardScaler

# Standardise the features (zero mean, unit variance) — required for
# distance-based clustering algorithms.
sc = StandardScaler()
df_clean = sc.fit_transform(df_clean)
Cluster = 3 has a Silhouette Score of 0.2399 and a good dip in average distortion
from scipy.spatial.distance import cdist
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans

# Elbow method: fit KMeans for k = 2..9, tracking the average distortion
# (mean distance of each point to its nearest centroid) and printing the
# silhouette score for every k.
clusters = range(2, 10)
meandistortion = []
for k in clusters:
    # Fixed random_state so the printed scores are reproducible, consistent
    # with the final model fitted below with random_state = 1.  (The original
    # omitted it, giving different results on every run.)
    model = KMeans(n_clusters=k, random_state=1)
    model = model.fit(df_clean)
    meandistortion.append(
        sum(np.min(cdist(df_clean, model.cluster_centers_, 'euclidean'), axis=1)) / df_clean.shape[0])
    print("For Cluster = %i, the Silhouette Score is %1.4f" % (k, silhouette_score(df_clean, model.labels_)))
plt.plot(clusters, meandistortion, 'bx-')
plt.xlabel('k - Number of Clusters')
plt.ylabel('Average Distortion')
plt.title('Selecting k with the Elbow method')
# Final KMeans model with the chosen k = 3 (seeded for reproducibility).
clus = KMeans(random_state=1, n_clusters=3)
clus.fit_predict(df_clean)
# Drop the z-score helper columns from the retained copy, then visualise the
# KMeans assignment on the first two (scaled) features.
zcols_to_drop = ['zradius_ratio', 'zpr.axis_aspect_ratio',
                 'zmax.length_aspect_ratio', 'zscaled_variance',
                 'zscaled_variance.1', 'zscaled_radius_of_gyration.1',
                 'zskewness_about', 'zskewness_about.1']
mydata_copy.drop(zcols_to_drop, axis=1, inplace=True)
plt.scatter(df_clean[:, 0], df_clean[:, 1], c=clus.labels_)
plt.show()
from scipy.cluster.hierarchy import dendrogram, linkage

# Agglomerative clustering with Ward linkage on the scaled data.
Z = linkage(df_clean, method='ward', metric='euclidean')
Z.shape
# Full dendrogram of all merges.
plt.figure(figsize=(25, 10))
dendrogram(Z)
plt.show()
# Truncated view showing only the last 3 merged clusters.
dendrogram(Z, truncate_mode='lastp', p=3)
plt.show()
from scipy.cluster.hierarchy import fcluster

# Cut the dendrogram at distance 40 to obtain flat cluster labels.
max_d = 40
clusters = fcluster(Z, max_d, criterion='distance')
# Colour the scatter by hierarchical cluster membership.
plt.scatter(df_clean[:, 0], df_clean[:, 1], c=clusters)
plt.show()
mydata_copy.head()
# Attach both labelings so the two algorithms can be compared row-by-row.
mydata_copy['HCluster'] = clusters
mydata_copy['KCluster'] = clus.labels_
df_final = mydata_copy.copy()
df_final.drop('class', inplace=True, axis=1)
# Per-cluster medians for each labeling.
df_final.groupby('HCluster').median()
df_final.groupby('KCluster').median()
# Map hierarchical-cluster ids onto the KMeans label space
# (1 -> 1, 2 -> 2, everything else -> 0) so the labelings line up.
df_final['HC'] = np.where(df_final['HCluster'] == 1, 1,
                          np.where(df_final['HCluster'] == 2, 2, 0))
df_final['Match'] = np.where(df_final['KCluster'] == df_final['HC'], "True", "False")
df_final.groupby('Match')['Match'].count()
# Count matches via the boolean comparison directly.  The original indexed
# the groupby result positionally (count()[1]), which only happens to be the
# "True" bucket when both values occur and "False" sorts first — it breaks
# (or silently picks the wrong bucket) otherwise.
match_pct = 100 * (df_final['Match'] == "True").sum() / df_final.shape[0]
print("Accuracy percentage of match between KMeans and HCluster is %2.2f" % match_pct)
# Build the supervised-learning dataset: drop the cluster labels and the
# previously removed correlated features, encode the target, and split.
c = mydata_copy.copy()
c.drop(['HCluster', 'KCluster'], axis=1, inplace=True)
c.drop(rem, axis=1, inplace=True)
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# Encode the 3 vehicle classes as integers 0..2.
c['class'] = le.fit_transform(c['class'])
y = c['class']
X = c.drop(['class'], axis=1)
from sklearn.model_selection import train_test_split
# 70/30 train/test split, seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
# Fit the scaler on the TRAINING data only, then apply that same
# transformation to the test data.  The original called fit_transform on
# X_test as well, which re-estimates the mean/std from the test set —
# test-set statistics leaking into preprocessing and an inconsistent scale
# between the two splits.
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn import metrics

# Baseline SVM classifier (RBF kernel by default) on all retained features.
svm = SVC(random_state=0, probability=True)
svm.fit(X_train, y_train)
svmpredict = svm.predict(X_test)
print(classification_report(y_test, svmpredict))
print("Accuracy Score is %5.3f " % (accuracy_score(y_test, svmpredict) * 100))
# Confusion matrix with class labels ordered 2, 1, 0 for the heatmap.
cm_svm = metrics.confusion_matrix(y_test, svmpredict, labels=[2, 1, 0])
class_names = ["2", "1", "0"]
df_cm_svm = pd.DataFrame(cm_svm, index=class_names,
                         columns=["Predict " + name for name in class_names])
plt.figure(figsize=(7, 5))
sns.heatmap(df_cm_svm, annot=True, cmap="Greens", fmt='g')
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the baseline SVM on the training set.
accuracies = cross_val_score(estimator=svm, X=X_train, y=y_train, cv=10)
print("Accuracy: {:.2f} %".format(accuracies.mean() * 100))
print("Standard Deviation: {:.2f} %".format(accuracies.std() * 100))
from sklearn.decomposition import PCA

# Reduce the scaled features to 6 principal components and refit the SVM.
pca = PCA(n_components=6, random_state=0)
PCAX_train = pca.fit_transform(X_train)
PCAX_test = pca.transform(X_test)
# Fraction of total variance each component explains.
variance = pca.explained_variance_ratio_
print(variance)
svmpca = SVC(random_state=0, probability=True)
svmpca.fit(PCAX_train, y_train)
svmpredictpca = svmpca.predict(PCAX_test)
print(classification_report(y_test, svmpredictpca))
print("Accuracy Score is %5.3f " % (accuracy_score(y_test, svmpredictpca) * 100))
# Confusion matrix (labels ordered 2, 1, 0) for the PCA-reduced model.
cm_svm = metrics.confusion_matrix(y_test, svmpredictpca, labels=[2, 1, 0])
label_names = ["2", "1", "0"]
df_cm_svm = pd.DataFrame(cm_svm, index=label_names,
                         columns=["Predict " + name for name in label_names])
plt.figure(figsize=(7, 5))
sns.heatmap(df_cm_svm, annot=True, cmap="Greens", fmt='g')
# Cross-validated accuracy of the PCA-reduced model.
accuraciespca = cross_val_score(estimator=svmpca, X=PCAX_train, y=y_train, cv=10)
print("Accuracy: {:.2f} %".format(accuraciespca.mean() * 100))
print("Standard Deviation: {:.2f} %".format(accuraciespca.std() * 100))
# Sweep the number of retained components from 2 to 9, reporting the
# 10-fold cross-validated accuracy for each setting.
for components in range(2, 10):
    pca = PCA(n_components=components, random_state=0)
    PCAX_train = pca.fit_transform(X_train)
    PCAX_test = pca.transform(X_test)
    svmpca.fit(PCAX_train, y_train)
    svmpredictpca1 = svmpca.predict(PCAX_test)
    accuraciespca = cross_val_score(estimator=svmpca, X=PCAX_train, y=y_train, cv=10)
    print(f"For {components} Components:")
    print(f"Accuracy: {accuraciespca.mean() * 100:.2f} %")
    print(f"Standard Deviation: {accuraciespca.std() * 100:.2f} %")
    print()
# Re-run the component sweep, this time collecting the mean and standard
# deviation of the CV accuracy for plotting.
mean = []
std = []
for components in range(2, 10):
    pca = PCA(n_components=components, random_state=0)
    PCAX_train = pca.fit_transform(X_train)
    PCAX_test = pca.transform(X_test)
    svmpca.fit(PCAX_train, y_train)
    svmpredictpca1 = svmpca.predict(PCAX_test)
    accuraciespca = cross_val_score(estimator=svmpca, X=PCAX_train, y=y_train, cv=10)
    mean.append(accuraciespca.mean() * 100)
    # Bug fix: 'std' was declared but never populated in the original loop.
    std.append(accuraciespca.std() * 100)
The ideal number of PCA components is 6
# Accuracy as a function of the number of retained PCA components.
plt.plot(range(2, 10), mean, label="Mean")
plt.title("PCA Components Vs Model Accuracy %")
plt.xlabel("PCA Components")
plt.ylabel("Model Accuracy Percentage")
plt.legend(loc=4)
plt.show()
# Manual PCA: eigendecomposition of the covariance matrix of the
# standardised feature matrix.
sc = StandardScaler()
X_standard = sc.fit_transform(X)
# Features must be in rows for np.cov, hence the transpose.
covariance_matrix = np.cov(X_standard.T)
print(covariance_matrix)
eigenvalues, eigenvectors = np.linalg.eig(covariance_matrix)
print(eigenvalues)
print(eigenvectors)
# Pair each eigenvalue with its eigenvector (column i of 'eigenvectors').
eigenpair = [(eigenvalues[i], eigenvectors[:, i]) for i in range(len(eigenvalues))]
print(eigenpair)
# Sort descending by eigenvalue only.  The original's plain tuple sort
# (sort() + reverse()) falls back to comparing the eigenvector arrays when
# two eigenvalues tie, which raises ValueError for numpy arrays.
eigenpair.sort(key=lambda pair: pair[0], reverse=True)
print(eigenpair)
# Unpack the sorted pairs and report each component's share of variance.
eigenvalues_sorted = [pair[0] for pair in eigenpair]
eigenvectors_sorted = [pair[1] for pair in eigenpair]
total_variance = sum(eigenvalues_sorted)
for value in eigenvalues_sorted:
    print(value / total_variance)
variance_explained = [value / total_variance for value in eigenvalues_sorted]
cumulative_variance = np.cumsum(variance_explained)
positions = range(len(eigenvalues_sorted))
plt.bar(positions, variance_explained, label="Individual Variance Explained")
plt.step(positions, cumulative_variance, label="Cumulative Variance Explained")
plt.legend(loc='best')
# Project the standardised data onto the top-6 eigenvectors (manual PCA).
PCAReduced = np.array(eigenvectors_sorted[:6])
X_PCAReduced = np.dot(X_standard, PCAReduced.T)
df_PCAReduced = pd.DataFrame(X_PCAReduced)
sns.pairplot(df_PCAReduced)
There is no correlation and each independent variable is 'independent'
df_PCAReduced.corr().round(5)
# Recap: classification report and accuracy for the SVM before PCA
# (svmpredict) and after PCA (svmpredictpca), in that order.
for preds in (svmpredict, svmpredictpca):
    print(classification_report(y_test, preds))
    print("Accuracy Score is %5.3f " % (accuracy_score(y_test, preds) * 100))
SVM Before PCA
Now, when we do a PCA we are trying to eliminate the noise from the data and forming a new linear combination of independent variables into fewer variables.
Each eigenvalue measures the variance of the data when projected onto its corresponding eigenvector; larger eigenvalues correspond to directions that capture more of the data's spread.
Matrix-multiplying (dot product) the standardised dataset by the selected eigenvectors yields the PCA-reduced dataset.
When we are reducing the number of variables, we can expect a reduction in the accuracy scores but at a benefit of handling lesser independent variables.
SVM After PCA
PCA does a good job and provides us a good accuracy score, when using production data we have to subject to the PCA reduction and then apply the classification algorithms.